library(data.table)
if (!file.exists("met_all.gz"))
download.file(
url = "https://raw.githubusercontent.com/USCbiostats/data-science-data/master/02_met/met_all.gz",
destfile = "met_all.gz",
method = "libcurl",
timeout = 60
)
met <- data.table::fread("met_all.gz")Lab4 KessieSHEN
1.Read in the data
2.Prepare the Data
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::between() masks data.table::between()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::first() masks data.table::first()
✖ lubridate::hour() masks data.table::hour()
✖ lubridate::isoweek() masks data.table::isoweek()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::last() masks data.table::last()
✖ lubridate::mday() masks data.table::mday()
✖ lubridate::minute() masks data.table::minute()
✖ lubridate::month() masks data.table::month()
✖ lubridate::quarter() masks data.table::quarter()
✖ lubridate::second() masks data.table::second()
✖ purrr::transpose() masks data.table::transpose()
✖ lubridate::wday() masks data.table::wday()
✖ lubridate::week() masks data.table::week()
✖ lubridate::yday() masks data.table::yday()
✖ lubridate::year() masks data.table::year()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
met <- met[temp>-17]
head(met) USAFID WBAN year month day hour min lat lon elev wind.dir
<int> <int> <int> <int> <int> <int> <int> <num> <num> <int> <int>
1: 690150 93121 2019 8 1 0 56 34.3 -116.166 696 220
2: 690150 93121 2019 8 1 1 56 34.3 -116.166 696 230
3: 690150 93121 2019 8 1 2 56 34.3 -116.166 696 230
4: 690150 93121 2019 8 1 3 56 34.3 -116.166 696 210
5: 690150 93121 2019 8 1 4 56 34.3 -116.166 696 120
6: 690150 93121 2019 8 1 5 56 34.3 -116.166 696 NA
wind.dir.qc wind.type.code wind.sp wind.sp.qc ceiling.ht ceiling.ht.qc
<char> <char> <num> <char> <int> <int>
1: 5 N 5.7 5 22000 5
2: 5 N 8.2 5 22000 5
3: 5 N 6.7 5 22000 5
4: 5 N 5.1 5 22000 5
5: 5 N 2.1 5 22000 5
6: 9 C 0.0 5 22000 5
ceiling.ht.method sky.cond vis.dist vis.dist.qc vis.var vis.var.qc temp
<char> <char> <int> <char> <char> <char> <num>
1: 9 N 16093 5 N 5 37.2
2: 9 N 16093 5 N 5 35.6
3: 9 N 16093 5 N 5 34.4
4: 9 N 16093 5 N 5 33.3
5: 9 N 16093 5 N 5 32.8
6: 9 N 16093 5 N 5 31.1
temp.qc dew.point dew.point.qc atm.press atm.press.qc rh
<char> <num> <char> <num> <int> <num>
1: 5 10.6 5 1009.9 5 19.88127
2: 5 10.6 5 1010.3 5 21.76098
3: 5 7.2 5 1010.6 5 18.48212
4: 5 5.0 5 1011.6 5 16.88862
5: 5 5.0 5 1012.7 5 17.38410
6: 5 5.6 5 1012.7 5 20.01540
library(dplyr)
key_variables <- c("temp", "rh", "wind.sp", "vis.dist", "dew.point", "lat", "lon", "elev")
for (var in key_variables) {
met[get(var) %in% c(9999, 999), (var) := NA]
}
met[, date := as.Date(paste(year, month, day, sep = "-"))]
setDT(met)
met <- met[date >= as.Date("2019-08-01") & date <= as.Date("2019-08-07")]
met <- as.data.table(met)
# Compute mean values by station
means_by_station <- met[, .(
temp_mean = mean(temp, na.rm = TRUE),
rh_mean = mean(rh, na.rm = TRUE),
wind_sp_mean = mean(wind.sp, na.rm = TRUE),
vis_dist_mean = mean(vis.dist, na.rm = TRUE),
dew_point_mean = mean(dew.point, na.rm = TRUE),
lat_mean = mean(lat, na.rm = TRUE),
lon_mean = mean(lon, na.rm = TRUE),
elev_mean = mean(elev, na.rm = TRUE)
), by = .(USAFID)]
# Create a region variable for NW, SW, NE, SE
met[, region := fifelse(lat > 39.71 & lon < -98.00, "NE",
fifelse(lat > 39.71 & lon >= -98.00, "NW",
fifelse(lat <= 39.71 & lon < -98.00, "SE", "SW")))]
#met_avg[, elev.cat := fifelse(elev>252, "high", "low")]3.Use geom_violin to examine the wind speed and dew point by region
library(ggplot2)
ggplot(met, aes(x = factor(1), y = wind.sp, fill = region)) +
geom_violin(trim = TRUE) +
facet_wrap(~ region) + # Use facets to separate regions
labs(title = "Wind Speed by Region",
x = "Region",
y = "Wind Speed") +
theme_minimal() +
theme(legend.position = "bottom")Warning: Removed 8515 rows containing non-finite outside the scale range
(`stat_ydensity()`).

ggplot(met, aes(x = factor(1), y = dew.point, fill = region)) +
geom_violin(trim = TRUE) +
facet_wrap(~ region) + # Use facets to separate regions
labs(title = "Dew Point by Region",
x = "Region",
y = "Dew Point") +
theme_minimal() +
theme(legend.position = "bottom")Warning: Removed 1227 rows containing non-finite outside the scale range
(`stat_ydensity()`).

region_colors <- c("NW" = "blue", "SW" = "green", "NE" = "red", "SE" = "purple")4.Use geom_jitter with stat_smooth to examine the association between dew point and wind speed by region
#Deal with NA before coloring points by region
met_clean <- na.omit(met, cols = c("dew.point", "wind.sp", "region"))
ggplot(met_clean, aes(x = dew.point, y = wind.sp, color = region)) +
geom_jitter(width = 0.3, height = 0.3) + # Adds a small amount of noise to each point
stat_smooth(method = "lm", se = FALSE) + # Fit linear regression lines
labs(title = "Association between Dew Point and Wind Speed by Region",
x = "Dew Point (°C)", y = "Wind Speed (m/s)") +
theme_minimal()`geom_smooth()` using formula = 'y ~ x'

Southeast Region: This region often has higher dew points Northwest Region :Dew points may not correlate as strongly with wind speed
**5.Use geom_bar to create barplots of the weather stations by elevation category colored by region
#bar plot of weather stations by elevation category
library(ggplot2)
ggplot(met, aes(x =factor(1), fill = region)) +
geom_bar(position = "dodge") +
labs(title = "Weather Stations by Elevation Category", x = "Elevation", y = "Count") +
scale_fill_brewer(palette = "Set1") +
theme_minimal()
a higher count of stations in the “Low” elevation category compared to High.
**6.Use stat_summary to examine mean dew point and wind speed by region with standard deviation error bars
ggplot(met_clean, aes(x = region, y = dew.point)) +
stat_summary(fun.data = "mean_sdl", geom = "pointrange") +
labs(title = "Mean Dew Point by Region", x = "Region", y = "Dew Point (°C)") +
theme_minimal()
# Adding error bars for wind speed
ggplot(met_clean, aes(x = region, y = wind.sp)) +
stat_summary(fun.data = "mean_sdl", geom = "pointrange") +
labs(title = "Mean Wind Speed by Region", x = "Region", y = "Wind Speed (m/s)") +
theme_minimal()
the average wind speeds can vary greatly depending on the geographical location of a region. Coastal areas, typically experience higher average wind speeds, inland areas, like the Midwest, often have lower average wind speeds
**7.Make a map showing the spatial trend in relative humidity in the US
library(leaflet)
met_clean_rh <- na.omit(met, cols = "rh")
pal <- colorNumeric(palette = "Blues", domain = met_clean_rh$rh)
leaflet(data = met_clean_rh) %>%
addTiles() %>%
addCircleMarkers(~lon, ~lat, color = ~pal(rh), radius = 3,
popup = ~paste("Station:", USAFID, "<br>RH:", rh),
group = "Humidity") %>%
addLegend(pal = pal, values = ~rh, title = "Relative Humidity (%)") %>%
setView(lng = -98, lat = 39.5, zoom = 4)